{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-27T07:17:29.890648Z",
     "start_time": "2018-11-27T07:17:29.622447Z"
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "d:\\programdata\\anaconda3\\envs\\tensorflow\\lib\\re.py:203: FutureWarning: split() requires a non-empty pattern match.\n",
      "  return _compile(pattern, flags).split(string, maxsplit)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "error rate is: 0.1\n"
     ]
    }
   ],
   "source": [
    "import re\n",
    "import random\n",
    "from numpy import *\n",
    "\n",
    "#tokenization,分词\n",
    "def textParse(s):\n",
    "    tokens = re.split(r'\\W*', s)\n",
    "    #转化成小写，且只取长度大于2的单词\n",
    "    return [tok.lower() for tok in tokens if len(tok) > 2]\n",
    "\n",
    "#通过set来创建无重复单词的字典\n",
    "def createVocab(fullText):\n",
    "    return list(set(fullText))\n",
    "\n",
    "#将一个单词list转化为向量表示, 某单词存在，则把vocabs中对应位置赋为1(此处应用的是伯努利模型，即不考虑次数，只考虑是否出现)\n",
    "def words2Vec(vocabs, words):\n",
    "    vec = [0 for _ in range(len(vocabs))]\n",
    "    for word in words:\n",
    "        if word in vocabs:\n",
    "            vec[vocabs.index(word)] = 1\n",
    "    return vec\n",
    "        \n",
    "#训练过程\n",
    "def trainNB(trainMat, trainClasses):\n",
    "    numDocs = len(trainMat)\n",
    "    numWords = len(trainMat[0])\n",
    "    #垃圾邮件的概率\n",
    "    pSpam = sum(trainClasses)/float(numDocs)\n",
    "    #分子p0是类别为0的概率,add-1 smmothing\n",
    "    p0num = ones(numWords)\n",
    "    p1num = ones(numWords)\n",
    "    #分母,add-1 smoothing\n",
    "    p0denom = 2.0\n",
    "    p1denom = 2.0\n",
    "    for i in range(numDocs):\n",
    "        #计算1类的分子分母,0类的分子分母\n",
    "        if trainClasses[i] == 1:\n",
    "            p1num += trainMat[i]\n",
    "            p1denom += sum(trainMat[i])\n",
    "        else:\n",
    "            p0num += trainMat[i]\n",
    "            p0denom += sum(trainMat[i])\n",
    "    #计算概率，p1,p0,这里两者均为向量表示，每个位置时该位置对应的单词的概率p1[i] = p(wi|c=1), p0[i] = p(wi|c=0)\n",
    "    #取自然对数是为了转化乘法为加法，防止向下溢出\n",
    "    p1 = log(p1num/p1denom)\n",
    "    p0 = log(p0num/p0denom)\n",
    "    return p0, p1, pSpam\n",
    "\n",
    "#分类过程,传入的概率p0和p1都是取了自然对数的,pSpam没有取\n",
    "def classifyNB(wordvec,p0vec,p1vec,pSpam):\n",
    "    p1 = sum(wordvec*p1vec) + log(pSpam)\n",
    "    p0 = sum(wordvec*p0vec) + log(1.0-pSpam)\n",
    "    #返回概率大的类别\n",
    "    if p1 > p0:\n",
    "        return 1\n",
    "    else:\n",
    "        return 0\n",
    "\n",
    "    \n",
    "def spamTest():\n",
    "    docs = []\n",
    "    classes = []\n",
    "    fullText = []\n",
    "    #总共有25个正例,25个反例\n",
    "    for i in range(1,26):\n",
    "        #每封邮件作为一个大字符串，使用textParse分词放入list\n",
    "        #docs存放每一封分词过后的邮件,fullText存放所有的单词,classes存放类别（spam中是正类）\n",
    "        words = textParse(open('email/spam/%d.txt'%i).read())\n",
    "        docs.append(words)\n",
    "        fullText.extend(words)\n",
    "        classes.append(1)\n",
    "        #同理求负例\n",
    "        words = textParse(open('email/ham/%d.txt'%i,encoding='gbk').read())\n",
    "        docs.append(words)\n",
    "        fullText.extend(words)\n",
    "        classes.append(0)\n",
    "    #构建字典\n",
    "    vocabs = createVocab(fullText)\n",
    "    #总共50封邮件，随机选择10封作为测试集，剩余40封为训练集，trainIndex和testIndex存的是选取的邮件的index\n",
    "    trainIndex = [i for i in range(50)]\n",
    "    testIndex = []\n",
    "    for i in range(10):\n",
    "        randIndex = int(random.uniform(0,len(trainIndex)))\n",
    "        testIndex.append(trainIndex[randIndex])\n",
    "        del trainIndex[randIndex]\n",
    "    #对于训练集，将每封邮件的单词列表转化成向量表示， 并存入相应的list\n",
    "    trainMat = []\n",
    "    trainClasses = []\n",
    "    for index in trainIndex:\n",
    "        trainMat.append(words2Vec(vocabs, docs[index]))\n",
    "        trainClasses.append(classes[index])\n",
    "    #训练模型，得到条件概率向量，以及先验概率pSpam\n",
    "    p0, p1, pSpam = trainNB(array(trainMat),array(trainClasses))\n",
    "    #在测试集上测试\n",
    "    errorCount = 0\n",
    "    for index in testIndex:\n",
    "        wordvec = words2Vec(vocabs, docs[index])\n",
    "        if classifyNB(array(wordvec), p0, p1, pSpam) != classes[index]:\n",
    "            errorCount += 1\n",
    "    print('error rate is:', float(errorCount)/len(testIndex))\n",
    "    \n",
    "spamTest()\n",
    "        "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-11-27T07:18:11.422690Z",
     "start_time": "2018-11-27T07:18:11.223557Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "error rate is: 0.09999999999999998\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "d:\\programdata\\anaconda3\\envs\\tensorflow\\lib\\re.py:203: FutureWarning: split() requires a non-empty pattern match.\n",
      "  return _compile(pattern, flags).split(string, maxsplit)\n"
     ]
    }
   ],
   "source": [
    "#使用sklearn工具包进行分类\n",
    "from sklearn.naive_bayes import MultinomialNB\n",
    "\n",
    "def spamNBsklearn():\n",
    "    #数据准备过程同上\n",
    "    docs = []\n",
    "    classes = []\n",
    "    fullText = []\n",
    "    for i in range(1,26):\n",
    "        words = textParse(open('email/spam/%d.txt'%i).read())\n",
    "        docs.append(words)\n",
    "        fullText.extend(words)\n",
    "        classes.append(1)\n",
    "        #同理求负例\n",
    "        words = textParse(open('email/ham/%d.txt'%i,encoding='gbk').read())\n",
    "        docs.append(words)\n",
    "        fullText.extend(words)\n",
    "        classes.append(0)\n",
    "    #构建字典\n",
    "    vocabs = createVocab(fullText)\n",
    "    #总共50封邮件，随机选择10封作为测试集，剩余40封为训练集，trainIndex和testIndex存的是选取的邮件的index\n",
    "    trainIndex = [i for i in range(50)]\n",
    "    testIndex = []\n",
    "    for i in range(10):\n",
    "        randIndex = int(random.uniform(0,len(trainIndex)))\n",
    "        testIndex.append(trainIndex[randIndex])\n",
    "        del trainIndex[randIndex]\n",
    "    #对于训练集，将每封邮件的单词列表转化成向量表示， 并存入相应的list\n",
    "    trainMat = []\n",
    "    trainClasses = []\n",
    "    for index in trainIndex:\n",
    "        trainMat.append(words2Vec(vocabs, docs[index]))\n",
    "        trainClasses.append(classes[index])\n",
    "    #对于测试集，将每封邮件的单词列表转化成向量表示， 并存入相应的list\n",
    "    testMat = []\n",
    "    testClasses = []\n",
    "    for index in testIndex:\n",
    "        testMat.append(words2Vec(vocabs, docs[index]))\n",
    "        testClasses.append(classes[index])\n",
    "    #使用sklearn包训练（使用多项式模型）\n",
    "    clf = MultinomialNB()\n",
    "    clf.fit(trainMat, trainClasses)\n",
    "    #test, clf.score 输出对测试样本的预测准确率平均值\n",
    "    score = clf.score(testMat, testClasses)\n",
    "    print('error rate is:', 1-score)\n",
    "\n",
    "spamNBsklearn()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.5"
  },
  "varInspector": {
   "cols": {
    "lenName": 16,
    "lenType": 16,
    "lenVar": 40
   },
   "kernels_config": {
    "python": {
     "delete_cmd_postfix": "",
     "delete_cmd_prefix": "del ",
     "library": "var_list.py",
     "varRefreshCmd": "print(var_dic_list())"
    },
    "r": {
     "delete_cmd_postfix": ") ",
     "delete_cmd_prefix": "rm(",
     "library": "var_list.r",
     "varRefreshCmd": "cat(var_dic_list()) "
    }
   },
   "types_to_exclude": [
    "module",
    "function",
    "builtin_function_or_method",
    "instance",
    "_Feature"
   ],
   "window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
